Wrapper for Blocksparse CuTensor code#3057
Conversation
… to make it a union type of CuTensorBS and AbstractArray?
|
Your PR requires formatting changes to meet the project's style guidelines. Click here to view the suggested changes.
diff --git a/lib/cutensor/src/blocksparse/interfaces.jl b/lib/cutensor/src/blocksparse/interfaces.jl
index c6eef0e5b..0a479ddf8 100644
--- a/lib/cutensor/src/blocksparse/interfaces.jl
+++ b/lib/cutensor/src/blocksparse/interfaces.jl
@@ -1,4 +1,4 @@
-## For now call contract in ITensor and rely on UnallocatedArrays to make
+## For now call contract in ITensor and rely on UnallocatedArrays to make
## C in a dry-run of the contraction.
# function Base.:(*)(A::CuTensorBS, B::CuTensorBs)
# tC = promote_type(eltype(A), eltype(B))
@@ -18,11 +18,13 @@
using LinearAlgebra
function LinearAlgebra.mul!(C::CuTensorBS, A::CuTensorBS, B::CuTensorBS, α::Number, β::Number)
- contract!(α,
- A, A.inds, CUTENSOR_OP_IDENTITY,
- B, B.inds, CUTENSOR_OP_IDENTITY,
- β,
- C, C.inds, CUTENSOR_OP_IDENTITY,
- CUTENSOR_OP_IDENTITY; jit=CUTENSOR_JIT_MODE_DEFAULT)
- return C
-end
\ No newline at end of file
+ contract!(
+ α,
+ A, A.inds, CUTENSOR_OP_IDENTITY,
+ B, B.inds, CUTENSOR_OP_IDENTITY,
+ β,
+ C, C.inds, CUTENSOR_OP_IDENTITY,
+ CUTENSOR_OP_IDENTITY; jit = CUTENSOR_JIT_MODE_DEFAULT
+ )
+ return C
+end
diff --git a/lib/cutensor/src/blocksparse/operations.jl b/lib/cutensor/src/blocksparse/operations.jl
index 19542e5de..0f98c92ef 100644
--- a/lib/cutensor/src/blocksparse/operations.jl
+++ b/lib/cutensor/src/blocksparse/operations.jl
@@ -9,23 +9,26 @@ function contract!(
@nospecialize(beta::Number),
@nospecialize(C), Cinds::ModeType, opC::cutensorOperator_t,
opOut::cutensorOperator_t;
- jit::cutensorJitMode_t=JIT_MODE_NONE,
- workspace::cutensorWorksizePreference_t=WORKSPACE_DEFAULT,
- algo::cutensorAlgo_t=ALGO_DEFAULT,
- compute_type::Union{DataType, cutensorComputeDescriptorEnum, Nothing}=nothing,
- plan::Union{CuTensorPlan, Nothing}=nothing)
+ jit::cutensorJitMode_t = JIT_MODE_NONE,
+ workspace::cutensorWorksizePreference_t = WORKSPACE_DEFAULT,
+ algo::cutensorAlgo_t = ALGO_DEFAULT,
+ compute_type::Union{DataType, cutensorComputeDescriptorEnum, Nothing} = nothing,
+ plan::Union{CuTensorPlan, Nothing} = nothing
+ )
actual_plan = if plan === nothing
- plan_contraction(A, Ainds, opA, B, Binds, opB, C, Cinds, opC, opOut;
- jit, workspace, algo, compute_type)
+ plan_contraction(
+ A, Ainds, opA, B, Binds, opB, C, Cinds, opC, opOut;
+ jit, workspace, algo, compute_type
+ )
else
plan
end
contractBS!(actual_plan, alpha, nonzero_blocks(A), nonzero_blocks(B), beta, nonzero_blocks(C))
-
+
if plan === nothing
- CUDA.unsafe_free!(actual_plan)
+ CUDA.unsafe_free!(actual_plan)
end
return C
@@ -33,12 +36,14 @@ end
## This function assumes A, B, and C are Arrays of pointers to CuArrays.
## Please overwrite the `nonzero_blocks` function for your datatype to access this function from contract!
-function contractBS!(plan::CuTensorPlan,
- @nospecialize(alpha::Number),
- @nospecialize(A::AbstractArray),
- @nospecialize(B::AbstractArray),
- @nospecialize(beta::Number),
- @nospecialize(C::AbstractArray))
+function contractBS!(
+ plan::CuTensorPlan,
+ @nospecialize(alpha::Number),
+ @nospecialize(A::AbstractArray),
+ @nospecialize(B::AbstractArray),
+ @nospecialize(beta::Number),
+ @nospecialize(C::AbstractArray)
+ )
scalar_type = plan.scalar_type
# Extract GPU pointers from each CuArray block
@@ -46,11 +51,13 @@ function contractBS!(plan::CuTensorPlan,
A_ptrs = CuPtr{Cvoid}[pointer(block) for block in A]
B_ptrs = CuPtr{Cvoid}[pointer(block) for block in B]
C_ptrs = CuPtr{Cvoid}[pointer(block) for block in C]
-
- cutensorBlockSparseContract(handle(), plan,
- Ref{scalar_type}(alpha), A_ptrs, B_ptrs,
- Ref{scalar_type}(beta), C_ptrs, C_ptrs,
- plan.workspace, sizeof(plan.workspace), stream())
+
+ cutensorBlockSparseContract(
+ handle(), plan,
+ Ref{scalar_type}(alpha), A_ptrs, B_ptrs,
+ Ref{scalar_type}(beta), C_ptrs, C_ptrs,
+ plan.workspace, sizeof(plan.workspace), stream()
+ )
synchronize(stream())
return C
end
@@ -60,21 +67,22 @@ function plan_contraction(
@nospecialize(B), Binds::ModeType, opB::cutensorOperator_t,
@nospecialize(C), Cinds::ModeType, opC::cutensorOperator_t,
opOut::cutensorOperator_t;
- jit::cutensorJitMode_t=JIT_MODE_NONE,
- workspace::cutensorWorksizePreference_t=WORKSPACE_DEFAULT,
- algo::cutensorAlgo_t=ALGO_DEFAULT,
- compute_type::Union{DataType, cutensorComputeDescriptorEnum, Nothing}=nothing)
+ jit::cutensorJitMode_t = JIT_MODE_NONE,
+ workspace::cutensorWorksizePreference_t = WORKSPACE_DEFAULT,
+ algo::cutensorAlgo_t = ALGO_DEFAULT,
+ compute_type::Union{DataType, cutensorComputeDescriptorEnum, Nothing} = nothing
+ )
!is_unary(opA) && throw(ArgumentError("opA must be a unary op!"))
!is_unary(opB) && throw(ArgumentError("opB must be a unary op!"))
!is_unary(opC) && throw(ArgumentError("opC must be a unary op!"))
!is_unary(opOut) && throw(ArgumentError("opOut must be a unary op!"))
-
+
descA = CuTensorBSDescriptor(A)
descB = CuTensorBSDescriptor(B)
descC = CuTensorBSDescriptor(C)
# for now, D must be identical to C (and thus, descD must be identical to descC)
-
+
modeA = collect(Cint, Ainds)
modeB = collect(Cint, Binds)
modeC = collect(Cint, Cinds)
@@ -87,17 +95,19 @@ function plan_contraction(
desc = Ref{cutensorOperationDescriptor_t}()
- cutensorCreateBlockSparseContraction(handle(),
- desc,
- descA, modeA, opA,
- descB, modeB, opB,
- descC, modeC, opC,
- descC, modeC, actual_compute_type)
+ cutensorCreateBlockSparseContraction(
+ handle(),
+ desc,
+ descA, modeA, opA,
+ descB, modeB, opB,
+ descC, modeC, opC,
+ descC, modeC, actual_compute_type
+ )
plan_pref = Ref{cutensorPlanPreference_t}()
cutensorCreatePlanPreference(handle(), plan_pref, algo, jit)
- plan = CuTensorPlan(desc[], plan_pref[]; workspacePref=workspace)
+ plan = CuTensorPlan(desc[], plan_pref[]; workspacePref = workspace)
# cutensorDestroyOperationDescriptor(desc[])
cutensorDestroyPlanPreference(plan_pref[])
return plan
diff --git a/lib/cutensor/src/blocksparse/types.jl b/lib/cutensor/src/blocksparse/types.jl
index 292dc4d00..41cbebdbd 100644
--- a/lib/cutensor/src/blocksparse/types.jl
+++ b/lib/cutensor/src/blocksparse/types.jl
@@ -12,20 +12,26 @@ mutable struct CuTensorBS{T, N}
## This expects a Vector{Tuple(Int)} right now
nonzero_block_coords
- function CuTensorBS{T, N}(nonzero_data::Vector{<:CuArray},
- blocks_per_mode::Vector{Int}, block_extents, nonzero_block_coords, inds::Vector) where {T<:Number, N}
+ function CuTensorBS{T, N}(
+ nonzero_data::Vector{<:CuArray},
+ blocks_per_mode::Vector{Int}, block_extents, nonzero_block_coords, inds::Vector
+ ) where {T <: Number, N}
CuArrayT = eltype(nonzero_data)
@assert eltype(CuArrayT) == T
# @assert ndims(CuArrayT) == N
@assert length(block_extents) == N
- new(nonzero_data, inds, blocks_per_mode, block_extents, nonzero_block_coords)
+ return new(nonzero_data, inds, blocks_per_mode, block_extents, nonzero_block_coords)
end
end
-function CuTensorBS(nonzero_data::Vector{<:CuArray{T}},
- blocks_per_mode, block_extents, nonzero_block_coords, inds::Vector) where {T<:Number}
- CuTensorBS{T,length(block_extents)}(nonzero_data,
- blocks_per_mode, block_extents, nonzero_block_coords, inds)
+function CuTensorBS(
+ nonzero_data::Vector{<:CuArray{T}},
+ blocks_per_mode, block_extents, nonzero_block_coords, inds::Vector
+ ) where {T <: Number}
+ return CuTensorBS{T, length(block_extents)}(
+ nonzero_data,
+ blocks_per_mode, block_extents, nonzero_block_coords, inds
+ )
end
# array interface
function Base.size(T::CuTensorBS)
@@ -39,8 +45,8 @@ Base.strides(T::CuTensorBS) = vcat([[st...] for st in strides.(T.nonzero_data)].
Base.eltype(T::CuTensorBS) = eltype(eltype(T.nonzero_data))
function block_extents(T::CuTensorBS)
- extents = Vector{Int64}()
-
+ extents = Vector{Int64}()
+
for ex in T.block_extents
extents = vcat(extents, ex...)
end
@@ -66,18 +72,21 @@ mutable struct CuTensorBSDescriptor
handle::cutensorBlockSparseTensorDescriptor_t
# inner constructor handles creation and finalizer of the descriptor
function CuTensorBSDescriptor(
- numModes,
- numNonZeroBlocks,
- numSectionsPerMode,
- extent,
- nonZeroCoordinates,
- stride,
- eltype)
+ numModes,
+ numNonZeroBlocks,
+ numSectionsPerMode,
+ extent,
+ nonZeroCoordinates,
+ stride,
+ eltype
+ )
desc = Ref{cuTENSOR.cutensorBlockSparseTensorDescriptor_t}()
- cutensorCreateBlockSparseTensorDescriptor(handle(), desc,
- numModes, numNonZeroBlocks, numSectionsPerMode, extent, nonZeroCoordinates,
- stride, eltype)
+ cutensorCreateBlockSparseTensorDescriptor(
+ handle(), desc,
+ numModes, numNonZeroBlocks, numSectionsPerMode, extent, nonZeroCoordinates,
+ stride, eltype
+ )
obj = new(desc[])
finalizer(unsafe_destroy!, obj)
@@ -86,12 +95,13 @@ mutable struct CuTensorBSDescriptor
end
function CuTensorBSDescriptor(
- numModes,
- numNonZeroBlocks,
- numSectionsPerMode,
- extent,
- nonZeroCoordinates,
- eltype)
+ numModes,
+ numNonZeroBlocks,
+ numSectionsPerMode,
+ extent,
+ nonZeroCoordinates,
+ eltype
+ )
return CuTensorBSDescriptor(numModes, numNonZeroBlocks, numSectionsPerMode, extent, nonZeroCoordinates, C_NULL, eltype)
end
@@ -101,7 +111,7 @@ Base.show(io::IO, desc::CuTensorBSDescriptor) = @printf(io, "CuTensorBSDescripto
Base.unsafe_convert(::Type{cutensorBlockSparseTensorDescriptor_t}, obj::CuTensorBSDescriptor) = obj.handle
function unsafe_destroy!(obj::CuTensorBSDescriptor)
- cutensorDestroyBlockSparseTensorDescriptor(obj)
+ return cutensorDestroyBlockSparseTensorDescriptor(obj)
end
## Descriptor function for CuTensorBS type. Please overwrite for custom objects
@@ -110,11 +120,13 @@ function CuTensorBSDescriptor(A::CuTensorBS)
numNonZeroBlocks = Int64(length(A.nonzero_block_coords))
numSectionsPerMode = collect(Int32, A.blocks_per_mode)
extent = block_extents(A)
- nonZeroCoordinates = Int32.(vcat([[x...] for x in A.nonzero_block_coords]...) .- 1)
+ nonZeroCoordinates = Int32.(vcat([[x...] for x in A.nonzero_block_coords]...) .- 1)
st = strides(A)
- dataType = eltype(A)#convert(cuTENSOR.cutensorDataType_t, eltype(A))
+ dataType = eltype(A) #convert(cuTENSOR.cutensorDataType_t, eltype(A))
## Right now assume stride is NULL. I am not sure if stride works, need to discuss with cuTENSOR team.
- CuTensorBSDescriptor(numModes, numNonZeroBlocks,
- numSectionsPerMode, extent, nonZeroCoordinates, dataType)
+ return CuTensorBSDescriptor(
+ numModes, numNonZeroBlocks,
+ numSectionsPerMode, extent, nonZeroCoordinates, dataType
+ )
end
diff --git a/lib/cutensor/src/libcutensor.jl b/lib/cutensor/src/libcutensor.jl
index b33560b72..4e7ba168d 100644
--- a/lib/cutensor/src/libcutensor.jl
+++ b/lib/cutensor/src/libcutensor.jl
@@ -545,12 +545,12 @@ end
@gcsafe_ccall libcutensor.cutensorBlockSparseContract(handle::cutensorHandle_t,
plan::cutensorPlan_t,
alpha::Ptr{Cvoid},
- A::Ptr{CuPtr{Cvoid}},
- B::Ptr{CuPtr{Cvoid}},
+ A::Ptr{CuPtr{Cvoid}},
+ B::Ptr{CuPtr{Cvoid}},
beta::Ptr{Cvoid},
- C::Ptr{CuPtr{Cvoid}},
- D::Ptr{CuPtr{Cvoid}},
- workspace::CuPtr{Cvoid},
+ C::Ptr{CuPtr{Cvoid}},
+ D::Ptr{CuPtr{Cvoid}},
+ workspace::CuPtr{Cvoid},
workspaceSize::UInt64,
stream::cudaStream_t)::cutensorStatus_t
end
diff --git a/lib/cutensor/test/contractions.jl b/lib/cutensor/test/contractions.jl
index 636600a74..baf56949a 100644
--- a/lib/cutensor/test/contractions.jl
+++ b/lib/cutensor/test/contractions.jl
@@ -188,62 +188,73 @@ end
end
end
-eltypes_compact = [
- (Float32, Float32, Float32, Float32),
- (ComplexF32, ComplexF32, ComplexF32, Float32),
- (Float64, Float64, Float64, Float64),
- (ComplexF64, ComplexF64, ComplexF64, Float64)
-]
-@testset "Blocksparse Contraction" begin
- ## There are many unsupported types because this is a new functionality
- ## So I will test with Float32 and ComplexF32 only
- @testset for (eltyA, eltyB, eltyC, eltyCompute) in eltypes_compact
- ## i = [20,20,25]
- ## k = [10,10,15]
- ## l = [30,30,35]
- ## A = Tensor(k,i,l)
- ## Nonzero blocks are
- ## [1,1,1], [1,1,3], [1,3,1], [1,3,3], [3,1,1], [3,1,3], [3,3,1], [3,3,3]
- A = Vector{CuArray{eltyA, 3}}()
- for k in [10,15]
- for i in [20,25]
- for l in [30,35]
- push!(A, CuArray(ones(eltyA, k,i,l)))
+ eltypes_compact = [
+ (Float32, Float32, Float32, Float32),
+ (ComplexF32, ComplexF32, ComplexF32, Float32),
+ (Float64, Float64, Float64, Float64),
+ (ComplexF64, ComplexF64, ComplexF64, Float64),
+ ]
+ @testset "Blocksparse Contraction" begin
+ ## There are many unsupported types because this is a new functionality
+ ## So I will test with Float32 and ComplexF32 only
+ @testset for (eltyA, eltyB, eltyC, eltyCompute) in eltypes_compact
+ ## i = [20,20,25]
+ ## k = [10,10,15]
+ ## l = [30,30,35]
+ ## A = Tensor(k,i,l)
+ ## Nonzero blocks are
+ ## [1,1,1], [1,1,3], [1,3,1], [1,3,3], [3,1,1], [3,1,3], [3,3,1], [3,3,3]
+ A = Vector{CuArray{eltyA, 3}}()
+ for k in [10, 15]
+ for i in [20, 25]
+ for l in [30, 35]
+ push!(A, CuArray(ones(eltyA, k, i, l)))
+ end
end
end
- end
- ## B = Tensor(k,l)
- ## Nonzero blocks are
- ## [1,1], [2,3]
- B = Array{CuArray{eltyB, 2}}(
- [CuArray(randn(eltyB, 10, 30)),
- CuArray(randn(eltyB, 10, 35))])
-
- ## C = Tensor(i)
- ## Nonzero blocks are
- ## [1,], [3,]
- C = Vector{CuArray{eltyC, 1}}(
- [CuArray(zeros(eltyC, 20)),
- CuArray(zeros(eltyC, 25))]
- )
-
- cuTenA = cuTENSOR.CuTensorBS(A, [3,3,3],
- [(10,10,15), (20,20,25), (30,30,35)],
- [(1,1,1), (1,1,3), (1,3,1), (1,3,3), (3,1,1), (3,1,3), (3,3,1), (3,3,3)],
- [1,3,2])
- cuTenB = cuTENSOR.CuTensorBS(B, [3,3],
- [(10,10,15), (30,30,35)],
- [(1,1),(2,3)], [1,2], )
- cuTenC = cuTENSOR.CuTensorBS(C, [3],
- [(20,20,25)],[(1,),(3,)], [3])
-
- mul!(cuTenC, cuTenA, cuTenB, 1, 0)
- ## C[1] = A[1,1,1] * B[1,1]
- @test C[1] ≈ reshape(permutedims(A[1], (2,1,3)), (20, 10 * 30)) * reshape(B[1], (10 * 30))
- ## C[3] = A[1,3,1] * B[1,1]
- @test C[2] ≈ reshape(permutedims(A[3], (2,1,3)), (25, 10 * 30)) * reshape(B[1], (10 * 30))
+ ## B = Tensor(k,l)
+ ## Nonzero blocks are
+ ## [1,1], [2,3]
+ B = Array{CuArray{eltyB, 2}}(
+ [
+ CuArray(randn(eltyB, 10, 30)),
+ CuArray(randn(eltyB, 10, 35)),
+ ]
+ )
+
+ ## C = Tensor(i)
+ ## Nonzero blocks are
+ ## [1,], [3,]
+ C = Vector{CuArray{eltyC, 1}}(
+ [
+ CuArray(zeros(eltyC, 20)),
+ CuArray(zeros(eltyC, 25)),
+ ]
+ )
+
+ cuTenA = cuTENSOR.CuTensorBS(
+ A, [3, 3, 3],
+ [(10, 10, 15), (20, 20, 25), (30, 30, 35)],
+ [(1, 1, 1), (1, 1, 3), (1, 3, 1), (1, 3, 3), (3, 1, 1), (3, 1, 3), (3, 3, 1), (3, 3, 3)],
+ [1, 3, 2]
+ )
+ cuTenB = cuTENSOR.CuTensorBS(
+ B, [3, 3],
+ [(10, 10, 15), (30, 30, 35)],
+ [(1, 1), (2, 3)], [1, 2],
+ )
+ cuTenC = cuTENSOR.CuTensorBS(
+ C, [3],
+ [(20, 20, 25)], [(1,), (3,)], [3]
+ )
+
+ mul!(cuTenC, cuTenA, cuTenB, 1, 0)
+ ## C[1] = A[1,1,1] * B[1,1]
+ @test C[1] ≈ reshape(permutedims(A[1], (2, 1, 3)), (20, 10 * 30)) * reshape(B[1], (10 * 30))
+ ## C[3] = A[1,3,1] * B[1,1]
+ @test C[2] ≈ reshape(permutedims(A[3], (2, 1, 3)), (25, 10 * 30)) * reshape(B[1], (10 * 30))
+ end
end
-end
end |
|
There were some issues in Clang.jl's conversion of the cuTENSOR.h file into Julia wrapper functions. Specifically, I had a runtime issue when trying to convert arrays of CuArray into |
…mp5VT/CUDA.jl into kmp5/feature/wrap_blocksparse_cutensor
Codecov Report❌ Patch coverage is
Additional details and impacted files@@ Coverage Diff @@
## master #3057 +/- ##
===========================================
+ Coverage 76.94% 89.44% +12.49%
===========================================
Files 148 151 +3
Lines 12984 13149 +165
===========================================
+ Hits 9991 11761 +1770
+ Misses 2993 1388 -1605 ☔ View full report in Codecov by Sentry. 🚀 New features to boost your workflow:
|
There was a problem hiding this comment.
CUDA.jl Benchmarks
Details
| Benchmark suite | Current: f6fb806 | Previous: a79b516 | Ratio |
|---|---|---|---|
array/accumulate/Float32/1d |
101586 ns |
101309 ns |
1.00 |
array/accumulate/Float32/dims=1 |
76501 ns |
76747 ns |
1.00 |
array/accumulate/Float32/dims=1L |
1584479 ns |
1585609 ns |
1.00 |
array/accumulate/Float32/dims=2 |
143697 ns |
143412 ns |
1.00 |
array/accumulate/Float32/dims=2L |
657941.5 ns |
657151 ns |
1.00 |
array/accumulate/Int64/1d |
118759 ns |
118450 ns |
1.00 |
array/accumulate/Int64/dims=1 |
80090 ns |
79685 ns |
1.01 |
array/accumulate/Int64/dims=1L |
1695812 ns |
1694399 ns |
1.00 |
array/accumulate/Int64/dims=2 |
156035 ns |
155494.5 ns |
1.00 |
array/accumulate/Int64/dims=2L |
961216 ns |
961001 ns |
1.00 |
array/broadcast |
20647 ns |
20538 ns |
1.01 |
array/construct |
1338.9 ns |
1298.9 ns |
1.03 |
array/copy |
18626 ns |
18512 ns |
1.01 |
array/copyto!/cpu_to_gpu |
213900 ns |
213295 ns |
1.00 |
array/copyto!/gpu_to_cpu |
280728 ns |
284330.5 ns |
0.99 |
array/copyto!/gpu_to_gpu |
11409 ns |
11273 ns |
1.01 |
array/iteration/findall/bool |
131868 ns |
132165 ns |
1.00 |
array/iteration/findall/int |
148699 ns |
148572 ns |
1.00 |
array/iteration/findfirst/bool |
81490 ns |
81324.5 ns |
1.00 |
array/iteration/findfirst/int |
83632 ns |
83910 ns |
1.00 |
array/iteration/findmin/1d |
83334.5 ns |
88268.5 ns |
0.94 |
array/iteration/findmin/2d |
116864 ns |
116719 ns |
1.00 |
array/iteration/logical |
200977.5 ns |
201488.5 ns |
1.00 |
array/iteration/scalar |
68229 ns |
67192 ns |
1.02 |
array/permutedims/2d |
52473 ns |
52378 ns |
1.00 |
array/permutedims/3d |
53112 ns |
52726 ns |
1.01 |
array/permutedims/4d |
51671.5 ns |
51596 ns |
1.00 |
array/random/rand/Float32 |
13170 ns |
13097 ns |
1.01 |
array/random/rand/Int64 |
35181 ns |
37319 ns |
0.94 |
array/random/rand!/Float32 |
8584 ns |
8581.666666666666 ns |
1.00 |
array/random/rand!/Int64 |
28967 ns |
34312 ns |
0.84 |
array/random/randn/Float32 |
44197.5 ns |
38478.5 ns |
1.15 |
array/random/randn!/Float32 |
31371 ns |
31422.5 ns |
1.00 |
array/reductions/mapreduce/Float32/1d |
34821 ns |
34936 ns |
1.00 |
array/reductions/mapreduce/Float32/dims=1 |
39799 ns |
49501 ns |
0.80 |
array/reductions/mapreduce/Float32/dims=1L |
51781 ns |
51907 ns |
1.00 |
array/reductions/mapreduce/Float32/dims=2 |
56728 ns |
56747.5 ns |
1.00 |
array/reductions/mapreduce/Float32/dims=2L |
69174 ns |
69513 ns |
1.00 |
array/reductions/mapreduce/Int64/1d |
42943 ns |
43154 ns |
1.00 |
array/reductions/mapreduce/Int64/dims=1 |
42715.5 ns |
43838 ns |
0.97 |
array/reductions/mapreduce/Int64/dims=1L |
87846 ns |
87668 ns |
1.00 |
array/reductions/mapreduce/Int64/dims=2 |
59476 ns |
59424 ns |
1.00 |
array/reductions/mapreduce/Int64/dims=2L |
84531 ns |
84576 ns |
1.00 |
array/reductions/reduce/Float32/1d |
34899 ns |
34859 ns |
1.00 |
array/reductions/reduce/Float32/dims=1 |
49342.5 ns |
39947.5 ns |
1.24 |
array/reductions/reduce/Float32/dims=1L |
51974 ns |
51723 ns |
1.00 |
array/reductions/reduce/Float32/dims=2 |
56893 ns |
56768 ns |
1.00 |
array/reductions/reduce/Float32/dims=2L |
69726 ns |
69769.5 ns |
1.00 |
array/reductions/reduce/Int64/1d |
43136 ns |
42778 ns |
1.01 |
array/reductions/reduce/Int64/dims=1 |
44599.5 ns |
44289 ns |
1.01 |
array/reductions/reduce/Int64/dims=1L |
87637 ns |
87701 ns |
1.00 |
array/reductions/reduce/Int64/dims=2 |
59524 ns |
59510 ns |
1.00 |
array/reductions/reduce/Int64/dims=2L |
84649 ns |
84815 ns |
1.00 |
array/reverse/1d |
18586 ns |
18338 ns |
1.01 |
array/reverse/1dL |
69143 ns |
68805 ns |
1.00 |
array/reverse/1dL_inplace |
65998 ns |
65983 ns |
1.00 |
array/reverse/1d_inplace |
8560.666666666666 ns |
8621.333333333334 ns |
0.99 |
array/reverse/2d |
20799 ns |
20615 ns |
1.01 |
array/reverse/2dL |
72745 ns |
72573 ns |
1.00 |
array/reverse/2dL_inplace |
66034 ns |
66098 ns |
1.00 |
array/reverse/2d_inplace |
10225 ns |
10260 ns |
1.00 |
array/sorting/1d |
2735348 ns |
2735030 ns |
1.00 |
array/sorting/2d |
1072127 ns |
1071674 ns |
1.00 |
array/sorting/by |
3305263 ns |
3313782 ns |
1.00 |
cuda/synchronization/context/auto |
1190.3 ns |
1186.2 ns |
1.00 |
cuda/synchronization/context/blocking |
935.8888888888889 ns |
924.0487804878048 ns |
1.01 |
cuda/synchronization/context/nonblocking |
7990.299999999999 ns |
7835.8 ns |
1.02 |
cuda/synchronization/stream/auto |
1049.1 ns |
1041.2 ns |
1.01 |
cuda/synchronization/stream/blocking |
795.5098039215686 ns |
835.7402597402597 ns |
0.95 |
cuda/synchronization/stream/nonblocking |
7411.2 ns |
7438.2 ns |
1.00 |
integration/byval/reference |
144267 ns |
144123 ns |
1.00 |
integration/byval/slices=1 |
146256 ns |
146064 ns |
1.00 |
integration/byval/slices=2 |
285254 ns |
284754 ns |
1.00 |
integration/byval/slices=3 |
423668 ns |
423302 ns |
1.00 |
integration/cudadevrt |
102841 ns |
102654 ns |
1.00 |
integration/volumerhs |
9446008 ns |
9450427 ns |
1.00 |
kernel/indexing |
13429 ns |
13382 ns |
1.00 |
kernel/indexing_checked |
14148 ns |
14092 ns |
1.00 |
kernel/launch |
2091.8 ns |
2292.8888888888887 ns |
0.91 |
kernel/occupancy |
665.9875 ns |
675.4013157894736 ns |
0.99 |
kernel/rand |
14983 ns |
17995 ns |
0.83 |
latency/import |
3799121920 ns |
3823445090 ns |
0.99 |
latency/precompile |
4578702155 ns |
4598939035 ns |
1.00 |
latency/ttfp |
4381104662.5 ns |
4399692793 ns |
1.00 |
This comment was automatically generated by workflow using github-action-benchmark.
|
Thanks very much for putting this together, I'm happy to help with the header issues if needed! |
…but the C++ code is still in flux)
|
@kshyatt I removed the extra code, made the functions that linked to the library relatively agnostic (i.e. you are not forced to use CuTensorBS but can buy in if you'd like) and added a unit test. If you could help with the Clang.jl issue, that would be amazing! |
|
I'll try to take a look today! |
|
Did you use the scripts in |
…mp5VT/CUDA.jl into kmp5/feature/wrap_blocksparse_cutensor
Yes, I did use the scripts, but this produced the following error:
ERROR: MethodError: no method matching unsafe_convert(::Type{Ptr{Nothing}}, ::CuPtr{Nothing})
The function `unsafe_convert` exists, but no method is defined for this combination of argument types.
Closest candidates are:
unsafe_convert(::Type{Ptr{Nothing}}, ::LibGit2.GitBlame)
@ LibGit2 ~/.julia/juliaup/julia-1.12.1+0.x64.linux.gnu/share/julia/stdlib/v1.12/LibGit2/src/types.jl:1096
unsafe_convert(::Type{Ptr{Nothing}}, ::LibGit2.GitRevWalker)
@ LibGit2 ~/.julia/juliaup/julia-1.12.1+0.x64.linux.gnu/share/julia/stdlib/v1.12/LibGit2/src/types.jl:1096
unsafe_convert(::Type{Ptr{Nothing}}, ::LibGit2.GitDiffStats)
@ LibGit2 ~/.julia/juliaup/julia-1.12.1+0.x64.linux.gnu/share/julia/stdlib/v1.12/LibGit2/src/types.jl:1096
...
Stacktrace:
[1] Ref{Ptr{Nothing}}(a::Vector{CuPtr{Nothing}})
@ Base ./refpointer.jl:166
[2] cconvert
@ ./refpointer.jl:178 [inlined]
[3] macro expansion
@ ~/.julia/dev/CUDA.jl/lib/cutensor/src/libcutensor.jl:545 [inlined]
[4] (::cuTENSOR.var"#cutensorBlockSparseContract##0#cutensorBlockSparseContract##1"{…})()
@ cuTENSOR ~/.julia/packages/GPUToolbox/JLBB1/src/ccalls.jl:34
[5] retry_reclaim
@ ~/.julia/packages/CUDA/Il00B/src/memory.jl:434 [inlined]
[6] check
@ ~/.julia/dev/CUDA.jl/lib/cutensor/src/libcutensor.jl:22 [inlined]
[7] cutensorBlockSparseContract
@ ~/.julia/packages/GPUToolbox/JLBB1/src/ccalls.jl:33 [inlined]
[8]
@ cuTENSOR ~/.julia/dev/CUDA.jl/lib/cutensor/src/blocksparse/operations.jl:50
[9] contract!(alpha::Number, A::Any, Ainds::Vector{…}, opA::cuTENSOR.cutensorOperator_t, B::Any, Binds::Vector{…}, opB::cuTENSOR.cutensorOperator_t, beta::Number, C::Any, Cinds::Vector{…}, opC::cuTENSOR.cutensorOperator_t, opOut::cuTENSOR.cutensorOperator_t; jit::cuTENSOR.cutensorJitMode_t, workspace::cuTENSOR.cutensorWorksizePreference_t, algo::cuTENSOR.cutensorAlgo_t, compute_type::Nothing, plan::Nothing)
@ cuTENSOR ~/.julia/dev/CUDA.jl/lib/cutensor/src/blocksparse/operations.jl:25
[10] mul!(C::CuTensorBS{Float64, 1}, A::CuTensorBS{Float64, 3}, B::CuTensorBS{Float64, 2}, α::Float64, β::Float64)
@ cuTENSOR ~/.julia/dev/CUDA.jl/lib/cutensor/src/blocksparse/interfaces.jl:21
However, I found that if I modify the code to be |
|
Probably you missed some of the weird esoterica in |
17806da to
cc4b826
Compare
|
Thanks for doing all the work to get this going, I think it will be quite useful for a bunch of TN packages... |
lkdvos
left a comment
There was a problem hiding this comment.
Left some remaining comments, but for me I think most of the parts that I would use are there, since I don't really see myself going through the CuTensorBS construction (we also never used the CuTensor in TensorOperations so that is completely fine)
| mutable struct CuTensorBSDescriptor | ||
| handle::cutensorBlockSparseTensorDescriptor_t | ||
| # inner constructor handles creation and finalizer of the descriptor | ||
| function CuTensorBSDescriptor( |
There was a problem hiding this comment.
I think it would be helpful, both for clarity/self-documentation and for avoiding hard-to-decipher errors, to restrict the types of these arguments in the inner constructor. This would also be more in line with the CuTensorDescriptor type + constructors.
| extent, | ||
| nonZeroCoordinates, | ||
| eltype) | ||
|
|
There was a problem hiding this comment.
Perhaps a comment here to indicate which argument is filled in as C_NULL and what that means might be helpful
There was a problem hiding this comment.
I added a dummy input comment to show that strides=C_NULL and a comment.
| function CuTensorBSDescriptor(A::CuTensorBS) | ||
| numModes = Int32(ndims(A)) | ||
| numNonZeroBlocks = Int64(length(A.nonzero_block_coords)) | ||
| numSectionsPerMode = collect(Int32, A.blocks_per_mode) |
There was a problem hiding this comment.
If this has to be Int32, would it not be easier to immediately make that type restriction in the CuTensorBS type?
| numNonZeroBlocks = Int64(length(A.nonzero_block_coords)) | ||
| numSectionsPerMode = collect(Int32, A.blocks_per_mode) | ||
| extent = block_extents(A) | ||
| nonZeroCoordinates = Int32.(vcat([[x...] for x in A.nonzero_block_coords]...) .- 1) |
There was a problem hiding this comment.
Same comment here. It also seems slightly strange to me to have a different storage format from the type that is required for the contraction, as this seems to introduce some allocations that could possibly be avoided?
| st = strides(A) | ||
| dataType = eltype(A)#convert(cuTENSOR.cutensorDataType_t, eltype(A)) | ||
|
|
||
| ## Right now assume stride is NULL. I am not sure if stride works, need to discuss with cuTENSOR team. |
There was a problem hiding this comment.
Can we add an assert that the strides are the "natural ones" for that in the meantime?
Remove leftover code. Will need to make something like this to define mul! in the future. Co-authored-by: Lukas Devos <ldevos98@gmail.com>
Hi,
This is a wrapper type and a set of functions for accessing the newly introduced block-sparse cuTENSOR backend. Right now the code is expert-level, i.e. users need to either write a type that converts their object to the CuTensorBS type or provide the low-level operations required by the cuTENSOR kernels themselves. I am still writing a test, but the code is fully operational.
Thanks,
Karl